Validation Dataset from Fisch et al., 2018: https://doi.org/10.1016/j.joca.2018.07.012

Libraries and files loading

Extract DE genes from the supplementary files of the paper

library(readxl)
library(tidyverse)
library(ggvenn)
library(RColorBrewer)
library(DESeq2)
library(ggplot2)
library(limma)
library(heatmap3)
library(msigdbr)
library(clusterProfiler)
library(enrichplot)
library(writexl)


MEDIAsave="/home/mclaudia/MEDIA Project/Codici_finali/"
MEDIAgraph="/home/mclaudia/MEDIA Project/Codici_finali/Paper_figures/"

up_genes <- read_excel(paste0(MEDIAsave,"up_dw_genes_consensus.xlsx"), sheet=1)
dw_genes <- read_excel(paste0(MEDIAsave,"up_dw_genes_consensus.xlsx"), sheet=2)
# accessing all the sheets 
sheet = excel_sheets("~/MEDIA Project/NIHMS992829-supplement-Suppl_matl.xlsx")

# applying sheet names to dataframe names
dataframe = lapply(setNames(sheet, sheet), 
                    function(x) read_excel("~/MEDIA Project/NIHMS992829-supplement-Suppl_matl.xlsx", sheet=x,col_names = FALSE))

# attaching all dataframes together
data_frame = bind_rows(dataframe, .id="Sheet")

data_frame <-data_frame[15:12493, 1:5]
df <-data_frame[64:12479,c(1,3:5)]
df2 <-data_frame[1:63,1:4]
df2 <-df2[-c(1:4),]
dff <-bind_rows(df,df2)

colnames(dff) <-c("Sheet","gene","logFC","pv-adj")
dff$logFC <-as.numeric(dff$logFC)
dff$`pv-adj` <-as.numeric(dff$`pv-adj`)
dff <-dff[,-1]
dff <-dff[!duplicated(dff$gene),]
dff <-dff[complete.cases(dff),]

Save gene data from Validation

save(dff, file=paste0(MEDIAsave,"DEgenes_from_validation.RData"))
load(paste0(MEDIAsave,"DEgenes_from_validation.RData"))

Find out how many UP and DOWN genes are also present in the consensus signature

upInDf <-dff[dff$gene %in% up_genes$gene,]
setdiff(up_genes$gene,dff$gene) 
## [1] "ACKR2"  "R3HDML"
upInDf <-upInDf[order(upInDf$logFC, decreasing = TRUE),]
upInDf_sg <-upInDf[upInDf$`pv-adj` <= 0.05,]

dwInDf <-dff[dff$gene %in% dw_genes$gene,]
setdiff(dw_genes$gene,dff$gene) 
## character(0)
dwInDf <-dwInDf[order(dwInDf$logFC, decreasing = TRUE),]
dwInDf_sg <-dwInDf[dwInDf$`pv-adj` <= 0.05,]

Venn Plot of Consensus genes and DE Validation genes

up_v <-dff[dff$logFC >=1.5 & dff$`pv-adj` <= 0.05,] 
up_v <-up_v[complete.cases(up_v),]  #315

dw_v <-dff[dff$logFC <= -1.5 & dff$`pv-adj` <= 0.05,]  
dw_v <-dw_v[complete.cases(dw_v),] #320

DE_Validation <- c(up_v$gene,dw_v$gene)
consensus_sig <- c(up_genes$gene,dw_genes$gene)

length(intersect(dw_v$gene,dw_genes$gene))
## [1] 3
length(intersect(up_v$gene,up_genes$gene))
## [1] 28
dff2 <-dff[dff$gene %in% DE_Validation,]
dff2 <-dff2[complete.cases(dff2$gene),]

v <-list(DE_Validation=DE_Validation, consensus_sig=consensus_sig)
names(v) <-c("DE genes in Validation Dataset","Consensus signature")

ggvenn(v, show_elements = F,show_percentage = T, text_size =17, set_name_size = 10,
       stroke_size = 0.5, fill_color = brewer.pal(name="Set1",n=4))+
ggtitle("Consensus genes differentially expressed in Validation dataset",
          subtitle = "Validation DE: log2FC threshold = |1.5|, pv_adj <= 0.05")+
theme(plot.title = element_text(hjust = 0.5,vjust = 1, size = 25),
        plot.subtitle = element_text(hjust = 0.5 ,vjust = 1, size = 15))

Fisher test to assess significance for the intersection

load(paste0(MEDIAsave,"combined_full_list.RData"))
conting_table <-data.frame("sign_in_Val"=rep(NA,2),"NOTsign_in_Val"=rep(NA,2))
rownames(conting_table) <-c("sign_in_Cons","NOTsign_in_Cons")


conting_table["sign_in_Cons","sign_in_Val"] <-length(intersect(DE_Validation,consensus_sig)) #31
conting_table["sign_in_Cons","NOTsign_in_Val"] <-length(setdiff(consensus_sig,DE_Validation)) #13

comb_notSig <-combined[!combined$gene %in% consensus_sig,] #13071=13115-44
Val_notSig <-dff[!dff$gene %in% dff2$gene,] #11811=12446-635

Val_notCons <-dff2[!dff2$gene %in% consensus_sig,] 

conting_table["NOTsign_in_Cons","sign_in_Val"] <-nrow(Val_notCons) #604
conting_table["NOTsign_in_Cons","NOTsign_in_Val"]<-length(intersect(comb_notSig$gene,Val_notSig$gene)) #10219

conting_table
##                 sign_in_Val NOTsign_in_Val
## sign_in_Cons             31             13
## NOTsign_in_Cons         604          10219
fisher.test(conting_table) 
## 
##  Fisher's Exact Test for Count Data
## 
## data:  conting_table
## p-value < 2.2e-16
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  20.36054 84.20978
## sample estimates:
## odds ratio 
##   40.29889
rm(comb_notSig,Val_notCons,Val_notSig)
write_xlsx(dff2, path=paste0(MEDIAsave,"DEValidationSelected.xlsx"))

png(file = paste0(MEDIAgraph,"Fig5a_venn.png"),  width = 14, height = 8, res=300, units = "in")
ggvenn(v, show_elements = F,show_percentage = T, text_size =14, set_name_size = 9,
       stroke_size = 0.5, fill_color = brewer.pal(name="Set1",n=4))+
ggtitle("Consensus genes differentially expressed in Validation dataset",
          subtitle = "Validation DE: log2FC threshold = |1.5|, pv_adj <= 0.05")+
theme(plot.title = element_text(hjust = 0.5,vjust = 1, size = 25),
        plot.subtitle = element_text(hjust = 0.5 ,vjust = 1, size = 15,margin = margin(0,0,15,0)))
dev.off()
rm(dff2)
norm4dataset <- as.data.frame(read_excel("~/MEDIA Project/GSE114007_raw_counts_4dataset.xlsx", sheet=1))  #Normal samples data
rownames(norm4dataset) <-norm4dataset$symbol
norm4dataset <-norm4dataset[,-1]

dim(norm4dataset) 
## [1] 23710    18
oa4dataset <- as.data.frame(read_excel("~/MEDIA Project/GSE114007_raw_counts_4dataset.xlsx", sheet=2))  #OA samples data
rownames(oa4dataset) <-oa4dataset$symbol
oa4dataset <-oa4dataset[,-1]

dim(oa4dataset)
## [1] 23710    20
norm4dataset <-norm4dataset[rownames(oa4dataset),]
newdataset <-cbind(norm4dataset,oa4dataset)
coldata <-data.frame(class=c(rep("N",18),rep("OA",20)))
coldata$class <-as.factor(coldata$class)
coldata$info <-colnames(newdataset)
coldata$condition <-"cart"
coldata[9:18,"condition"] <-"all"
coldata[19:28,"condition"] <-"cart"
coldata[29:38,"condition"] <-"all"
coldata$condition <-as.factor(coldata$condition)

rownames(coldata) <-coldata$info
newdataset <- newdataset[, rownames(coldata)]

Pre-process Validation dataset

dds0 <- DESeqDataSetFromMatrix(countData = newdataset,
                              colData = coldata,
                              design = ~condition + class)

dds0$class <- factor(dds0$class, levels = c("N","OA"))
dds0$class <- relevel(dds0$class, ref = "N")

dds0$condition <- factor(dds0$condition, levels = c("all","cart"))

keep <- rowSums(counts(dds0)>=5) >= 10  

dds <- dds0[keep,]
nrow(dds)  
## [1] 15961

Save validation pre-processed files

save(dds, file=paste0(MEDIAsave,"validationDatasetPreprocessed.RData"))

Variance Stabilizing Trasformation and Principal component analysis

vsd <- vst(dds, blind=FALSE)

pcaData <- plotPCA(vsd, intgroup=c("class", "condition"), returnData=TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))

ggplot(pcaData, aes(PC1, PC2, color=class, shape=condition,label =  rownames(pcaData))) +
  geom_point(size=3,alpha=1) +
  xlab(paste0("PC1: ",percentVar[1],"% variance")) +
  ylab(paste0("PC2: ",percentVar[2],"% variance")) + 
  coord_fixed()+ggtitle("All samples batch control")+ theme_bw()

Batch correction

mm <- model.matrix(~class, colData(vsd))
assay(vsd)<-removeBatchEffect(assay(vsd), batch = vsd$condition,design=mm) 

pcaData <- plotPCA(vsd, intgroup=c("class", "condition"), returnData=TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))

ggplot(pcaData, aes(PC1, PC2, color=class, shape=condition,label =  rownames(pcaData))) +
  geom_point(size=3,alpha=1) +
  xlab(paste0("PC1: ",percentVar[1],"% variance")) +
  ylab(paste0("PC2: ",percentVar[2],"% variance")) + 
  coord_fixed()+ggtitle("All samples without batch")+ theme_bw()

HEATMAP of consensus signature on the Validation dataset (z-score), batch corrected

normdata <-assay(vsd)
normdata1 <-t(apply(normdata,1,scale))
colnames(normdata1) <-colnames(normdata)
normdata <-normdata1

signature <-c(up_genes$gene,dw_genes$gene)

setdiff(signature,rownames(normdata)) #"ACKR2""R3HDML"  UP genes missing
## [1] "ACKR2"  "R3HDML"
up2 <-up_genes$gene[up_genes$gene %in% rownames(normdata)]

normdata <-normdata[c(up2,dw_genes$gene),]
dim(normdata)
## [1] 42 38
coldataAnn <-coldata
coldataAnn$col2 <-"plum"
coldataAnn[coldataAnn$class=="OA","col2"] <-"cyan"

rowdata <-data.frame(genes=c(up2,dw_genes$gene))
rowdata$col3 <-c(rep("red",37),rep("blue",5))

heatmap3(normdata[rowdata$genes,rownames(coldataAnn)],
         scale="none",
         col=colorRampPalette(c("royalblue", "white", "red1"))(1024),
         RowAxisColors = 1,
         balanceColor = TRUE,
         margins = c(16,10),
         ColSideColors = coldataAnn$col2,
         ColSideLabs = "Class",
         RowSideColors = rowdata$col3,
         RowSideLabs = "DE genes integration",
         cexRow = 0.9, cexCol = 0.9)
legend(0.8,1,legend=c("OA","Normal"),
       fill=c("cyan","plum"), bty = "n", title = "Class")
legend(0.9,1,legend=c("UP","DOWN"),
       fill=c("red","blue"), bty = "n", title = "DE")

png(file = paste0(MEDIAgraph,"Fig5b_ValHeatmap.png"),  width = 18, height = 11, res=300, units = "in")
heatmap3(normdata[rowdata$genes,rownames(coldataAnn)],
         scale="none",
         col=colorRampPalette(c("royalblue", "white", "red1"))(1024),
         RowAxisColors = 1,
         balanceColor = TRUE,
         margins = c(16,10),
         ColSideColors = coldataAnn$col2,
         ColSideLabs = "Class",
         RowSideColors = rowdata$col3,
         RowSideLabs = "DE genes integration",
         cexRow = .8, cexCol = .9)
legend(0.8,1,legend=c("OA","Normal"),
       fill=c("cyan","plum"), bty = "n", title = "Class")
legend(0.9,1,legend=c("UP","DOWN"),
       fill=c("red","blue"), bty = "n", title = "DE")
dev.off()

Enrichment analysis of consensus signature on Validation Dataset: GSEA Running Plot

sig <-data.frame(gs_name="Consensus signature",gene_symbol=c(up_genes$gene,dw_genes$gene))
colnames(sig) <-c("gs_name","gene_symbol")
colnames(sig) <-c("term_id","gene_id")

gene <-dff$logFC
names(gene) <-dff$gene
gene <-sort(gene,decreasing = TRUE)

set.seed(2459)
gs <- clusterProfiler::GSEA(gene,TERM2GENE = sig,
                             minGSSize = 5,
                             eps = 0,
                             pAdjustMethod = "fdr",
                             pvalueCutoff =0.01)

gseaplot2(gs, title ="Running score GSEA plot - validation dataset", geneSetID = 1, pvalue_table = T)